{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "ikEmvQTAT8fE",
    "outputId": "b8e5b582-546e-4613-91df-a82b167c00c2"
   },
   "outputs": [],
   "source": [
    "#!pip install instructor\n",
    "#!pip install openai\n",
    "#!pip install pydantic\n",
    "#!pip install jsonref\n",
    "#!pip install langchain\n",
    "#!pip install -U swifter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "h-nqriGCUQsb"
   },
   "outputs": [],
   "source": [
    "import instructor\n",
    "from pydantic import BaseModel\n",
    "from openai import OpenAI\n",
    "from langchain.prompts import PromptTemplate\n",
    "from tqdm import tqdm\n",
    "import os \n",
    "\n",
    "# Structured response requested from the model: eval_single passes this class\n",
    "# as `response_model=Evaluation` and returns its `model_dump()`.\n",
    "# NOTE(review): documented with `#` comments on purpose -- a class docstring on a\n",
    "# pydantic model is presumably exported into the schema sent to the LLM; confirm\n",
    "# before adding one, since it would change the request.\n",
    "class Evaluation(BaseModel):\n",
    "    # The prompt asks for a 1-5 Likert rating; the type is a plain int, so the range is not enforced here.\n",
    "    fluency: int\n",
    "    # The prompt asks for a 1-5 Likert rating of sentence 2 relative to sentence 1; range likewise not enforced.\n",
    "    adequacy: int\n",
    "\n",
    "# The key is read from the environment so it is never stored in the notebook.\n",
    "api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "# The instructor-wrapped client is what eval_single calls with `response_model=`.\n",
    "client = instructor.from_openai(OpenAI(api_key=api_key))\n",
    "\n",
    "# Judge instructions. {input_sentence1} and {input_sentence2} are filled in by\n",
    "# `template.format(...)` inside eval_single.\n",
    "# Fix: \"compate\" -> \"compare\" (typo in the instruction text sent to the model).\n",
    "prompt = \"\"\" You are a helpful language evaluator who can evaluate\n",
    "input sentence2 and provide an evaluation of its fluency with a\n",
    "likert scale rating of 1-5, 5 being highly fluent.\n",
    "You will also have to compare two sentences and judge how adequate\n",
    "is input sentence 2 with respect to input sentence 1, again with a likert scale rating of 1-5, 5 being highly adequate.\n",
    "Here are the sentences: input_sentence1: {input_sentence1}, input_sentence2:{input_sentence2}\"\"\"\n",
    "\n",
    "template = PromptTemplate(\n",
    "    input_variables=[\"input_sentence1\", \"input_sentence2\"],\n",
    "    template=prompt,\n",
    ")\n",
    "\n",
    "def eval_single(row):\n",
    "    \"\"\"Ask the LLM judge to rate one expected/generated caption pair.\n",
    "\n",
    "    Args:\n",
    "        row: A two-item ``(index, record)`` pair; ``record`` must support lookup\n",
    "            of \"Expected Caption\" and \"Generated Caption\". The index is ignored.\n",
    "\n",
    "    Returns:\n",
    "        The ``Evaluation`` response converted to a dict with ``model_dump()``.\n",
    "    \"\"\"\n",
    "    _, record = row\n",
    "    reference = record[\"Expected Caption\"]\n",
    "    candidate = record[\"Generated Caption\"]\n",
    "    judge_prompt = template.format(\n",
    "        input_sentence1=reference,\n",
    "        input_sentence2=candidate,\n",
    "    )\n",
    "    verdict = client.chat.completions.create(\n",
    "        model=\"gpt-4o\",\n",
    "        response_model=Evaluation,\n",
    "        messages=[{\"role\": \"user\", \"content\": judge_prompt}],\n",
    "    )\n",
    "    return verdict.model_dump()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Q3Wn3ELbcVdq",
    "outputId": "98107e8d-f20d-408d-fc0b-751e2abb9ed6"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import multiprocessing\n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "\n",
    "# Per-file driver: runs eval_single on every row of one results CSV and averages the scores\n",
    "\n",
    "def get_results_for_single_file(results_dir, file, num_threads=8):\n",
    "    \"\"\"Score every row of one results CSV and return the mean ratings.\n",
    "\n",
    "    Reads ``results_dir/file`` with pandas, submits each row to ``eval_single``\n",
    "    on a thread pool, stores the returned dicts in a new ``eval`` column, writes\n",
    "    the annotated frame to ``<name>.eval<ext>`` in the current working directory,\n",
    "    and returns the column averages.\n",
    "\n",
    "    Args:\n",
    "        results_dir: Directory that contains the input CSV.\n",
    "        file: File name of the CSV inside ``results_dir``.\n",
    "        num_threads: Maximum number of concurrent ``eval_single`` calls.\n",
    "\n",
    "    Returns:\n",
    "        dict with the keys \"Avg. Fluency\" and \"Avg. Adequacy\".\n",
    "\n",
    "    Raises:\n",
    "        Whatever ``eval_single`` raised for a row; ``future.result()`` re-raises it.\n",
    "    \"\"\"\n",
    "    file_path = os.path.join(results_dir, file)\n",
    "    df = pd.read_csv(file_path)\n",
    "\n",
    "    # as_completed() yields futures in completion order, not submission order.\n",
    "    # Remember each future's row position so every evaluation is stored against\n",
    "    # the row it was computed from.\n",
    "    results = [None] * len(df)\n",
    "    with ThreadPoolExecutor(max_workers=num_threads) as executor:\n",
    "        future_to_pos = {\n",
    "            executor.submit(eval_single, item): pos\n",
    "            for pos, item in enumerate(df.iterrows())\n",
    "        }\n",
    "\n",
    "        # total= lets tqdm render a real progress bar instead of a bare counter\n",
    "        for future in tqdm(as_completed(future_to_pos), total=len(future_to_pos)):\n",
    "            results[future_to_pos[future]] = future.result()\n",
    "\n",
    "    # A plain list is assigned positionally; a Series would be aligned on index labels.\n",
    "    df[\"eval\"] = results\n",
    "\n",
    "    # Insert \".eval\" before the extension only (\"x.csv\" -> \"x.eval.csv\") instead of\n",
    "    # rewriting every \"csv\" substring that happens to occur in the file name.\n",
    "    root, ext = os.path.splitext(file)\n",
    "    df.to_csv(f\"{root}.eval{ext}\")\n",
    "\n",
    "    average_adequacy = df[\"eval\"].apply(lambda x: x[\"adequacy\"]).mean()\n",
    "    average_fluency = df[\"eval\"].apply(lambda x: x[\"fluency\"]).mean()\n",
    "    return {\"Avg. Fluency\": average_fluency, \"Avg. Adequacy\": average_adequacy}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "161it [00:22,  7.21it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.6645962732919255, 'Avg. Adequacy': 1.031055900621118}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:32,  9.36it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.46250629089079, 'Avg. Adequacy': 1.516859587317564}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "162it [00:22,  7.36it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.728395061728395, 'Avg. Adequacy': 1.6296296296296295}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "162it [00:19,  8.16it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.617283950617284, 'Avg. Adequacy': 1.1666666666666667}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "161it [00:19,  8.40it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.838509316770186, 'Avg. Adequacy': 1.2670807453416149}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:19,  8.34it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.54601226993865, 'Avg. Adequacy': 1.2638036809815951}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:19,  8.36it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.852760736196319, 'Avg. Adequacy': 1.6319018404907975}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "162it [00:19,  8.39it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.783950617283951, 'Avg. Adequacy': 1.345679012345679}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [04:06,  8.06it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.823855057876195, 'Avg. Adequacy': 1.5772521389028686}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:18,  8.87it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.7975460122699385, 'Avg. Adequacy': 1.5889570552147239}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:18,  8.83it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.853658536585366, 'Avg. Adequacy': 1.5304878048780488}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:18,  9.03it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.5060975609756095, 'Avg. Adequacy': 1.2073170731707317}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:19,  8.45it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.7926829268292686, 'Avg. Adequacy': 1.1829268292682926}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:34,  9.25it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.7991947659788625, 'Avg. Adequacy': 1.4896829391041773}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:49,  8.68it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.772018117765476, 'Avg. Adequacy': 1.6024157020634122}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:53,  8.50it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.821841972823352, 'Avg. Adequacy': 1.3225968797181682}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:44,  8.83it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.827377956718672, 'Avg. Adequacy': 1.4116758933064921}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:21,  7.76it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.847560975609756, 'Avg. Adequacy': 1.6646341463414633}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:43,  8.89it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 3.4448917966784096, 'Avg. Adequacy': 1.3009562154001006}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:18,  8.66it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.737804878048781, 'Avg. Adequacy': 1.0365853658536586}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:41,  8.97it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.720684448917967, 'Avg. Adequacy': 1.0865626572722697}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:46,  8.78it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.828384499245093, 'Avg. Adequacy': 1.1071967790639154}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:19,  8.52it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.773006134969325, 'Avg. Adequacy': 1.2208588957055215}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:22,  7.41it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.865853658536586, 'Avg. Adequacy': 1.3109756097560976}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "162it [00:18,  8.64it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.8580246913580245, 'Avg. Adequacy': 1.4938271604938271}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "161it [00:17,  9.39it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.819875776397516, 'Avg. Adequacy': 1.4906832298136645}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "161it [00:18,  8.94it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.683229813664596, 'Avg. Adequacy': 1.1118012422360248}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:19,  8.43it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.871165644171779, 'Avg. Adequacy': 1.6196319018404908}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:17,  9.30it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.779141104294479, 'Avg. Adequacy': 1.1349693251533743}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:20,  8.04it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.823170731707317, 'Avg. Adequacy': 1.646341463414634}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "162it [00:18,  8.55it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.58641975308642, 'Avg. Adequacy': 1.0308641975308641}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:58,  8.34it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.7770508303975845, 'Avg. Adequacy': 1.097131353799698}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:20,  8.15it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.668711656441718, 'Avg. Adequacy': 1.1349693251533743}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:46,  8.78it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.733266230498239, 'Avg. Adequacy': 1.0085556114745848}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:40,  9.03it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.82083543029693, 'Avg. Adequacy': 1.2466029189733265}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:19,  8.42it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.754601226993865, 'Avg. Adequacy': 1.3496932515337423}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:42,  8.94it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.758429793658782, 'Avg. Adequacy': 1.2818319073980875}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:17,  9.35it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.6503067484662575, 'Avg. Adequacy': 1.01840490797546}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:19,  8.61it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.762195121951219, 'Avg. Adequacy': 1.5426829268292683}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "161it [00:18,  8.71it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.788819875776397, 'Avg. Adequacy': 1.5093167701863355}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:45,  8.81it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 3.915450427780574, 'Avg. Adequacy': 1.4735782586814292}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:18,  8.77it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.711656441717792, 'Avg. Adequacy': 1.5582822085889572}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:40,  9.00it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.889783593356819, 'Avg. Adequacy': 1.5515853044791141}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:21,  7.69it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.695121951219512, 'Avg. Adequacy': 1.2134146341463414}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:20,  8.05it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.487804878048781, 'Avg. Adequacy': 1.2195121951219512}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:20,  8.00it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.682926829268292, 'Avg. Adequacy': 1.1524390243902438}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:20,  8.00it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.50920245398773, 'Avg. Adequacy': 1.2453987730061349}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "163it [00:21,  7.71it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.6871165644171775, 'Avg. Adequacy': 1.0368098159509203}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:54,  8.47it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.791142425767489, 'Avg. Adequacy': 1.646200301962758}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:52,  8.55it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.6607951685958735, 'Avg. Adequacy': 1.2360342224458984}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "162it [00:18,  8.54it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.333333333333333, 'Avg. Adequacy': 1.1728395061728396}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "164it [00:18,  8.73it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.628048780487805, 'Avg. Adequacy': 1.024390243902439}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1987it [03:55,  8.45it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.732259687971816, 'Avg. Adequacy': 1.0452944136889784}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "161it [00:19,  8.16it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Avg. Fluency': 4.53416149068323, 'Avg. Adequacy': 1.2422360248447204}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "results_dir = \"../results\"\n",
    "all_res = {}\n",
    "# os.listdir() returns entries in arbitrary order and includes everything in the\n",
    "# directory, so sort for a reproducible run and keep only regular .csv files.\n",
    "for file in sorted(os.listdir(results_dir)):\n",
    "    is_csv = file.lower().endswith(\".csv\")\n",
    "    if not (is_csv and os.path.isfile(os.path.join(results_dir, file))):\n",
    "        continue\n",
    "    results = get_results_for_single_file(results_dir, file)\n",
    "    print(results)\n",
    "    # Key by the bare file stem; replace(\"csv\", \"\") left a trailing \".\" and also\n",
    "    # altered names that contain \"csv\" elsewhere.\n",
    "    all_res[os.path.splitext(file)[0]] = results\n",
    "\n",
    "# One row per input file, one column per averaged metric\n",
    "results_df = pd.DataFrame(all_res).transpose()\n",
    "results_df.to_csv(\"all_gpt4_results.csv\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
